In [1]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from faiq_utils import plot_wordcloud, barplot_cvec, \
                       stem_text

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)
%matplotlib inline
In [41]:
# download stopwords for NLTK
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faiq0913/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[41]:
True
In [2]:
df_transaction = pd.read_csv('../data/bank_transaction.csv', parse_dates=['txn_date'])
df_user = pd.read_csv('../data/user_profile.csv')
In [3]:
df_transaction.head(3)
Out[3]:
client_id bank_id account_id txn_id txn_date description amount category
0 1 1 1 4 2023-09-29 Earnin PAYMENT Donatas Danyal 20.0 Loans
1 1 1 1 3 2023-08-14 ONLINE TRANSFER FROM NDonatas DanyalDA O CARSON BUSINESS CHECKING 1216 1216 25.0 Transfer Credit
2 1 1 1 5 2023-09-25 MONEY TRANSFER AUTHORIZED ON 09/25 FROM Earnin CDAEJ_B CA S583269001208168 111 20.0 Loans

Here, we notice several missing values in the category column. This will prompt further investigation below.

In [4]:
df_transaction.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258779 entries, 0 to 258778
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   client_id    258779 non-null  int64         
 1   bank_id      258779 non-null  int64         
 2   account_id   258779 non-null  int64         
 3   txn_id       258779 non-null  int64         
 4   txn_date     258779 non-null  datetime64[ns]
 5   description  258779 non-null  object        
 6   amount       258779 non-null  float64       
 7   category     258522 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(4), object(2)
memory usage: 15.8+ MB
In [5]:
df_user.head(3)
Out[5]:
CLIENT_ID IS_INTERESTED_INVESTMENT IS_INTERESTED_BUILD_CREDIT IS_INTERESTED_INCREASE_INCOME IS_INTERESTED_PAY_OFF_DEBT IS_INTERESTED_MANAGE_SPENDING IS_INTERESTED_GROW_SAVINGS
0 1 False False False False False False
1 2 False False False False False False
2 3 False False False False False False
In [6]:
df_user.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   CLIENT_ID                      1000 non-null   int64
 1   IS_INTERESTED_INVESTMENT       1000 non-null   bool 
 2   IS_INTERESTED_BUILD_CREDIT     1000 non-null   bool 
 3   IS_INTERESTED_INCREASE_INCOME  1000 non-null   bool 
 4   IS_INTERESTED_PAY_OFF_DEBT     1000 non-null   bool 
 5   IS_INTERESTED_MANAGE_SPENDING  1000 non-null   bool 
 6   IS_INTERESTED_GROW_SAVINGS     1000 non-null   bool 
dtypes: bool(6), int64(1)
memory usage: 13.8 KB

1.0 Analyze bank_transaction.csv

1.1 Check on null values in category column & amount column distribution

We find 0.1% of transactions aren't classified into any categories and come from 2 clients with ID's 880 and 788. When we look into the amount of money being transacted, they fall well within the range of they fall within $200 though half of them are money coming in. This is in start contrast to transactions with no null values in their categories with the majority of money going out.

More data will be required to ascertain why these clients in particular are facing this issue. Hence, in our preprocessing, we'll choose to drop these transactions, especially since they account for such a small amount.

In [7]:
df_transaction.loc[df_transaction['category'].isnull()].head(10)
Out[7]:
client_id bank_id account_id txn_id txn_date description amount category
63984 788 1 1 115 2023-07-23 19:00:00 Transfer from Chime Savings Account 2.136 NaN
63993 788 1 1 114 2023-07-15 19:00:00 Transfer from Chime Savings Account 76.000 NaN
64000 788 1 1 121 2023-08-07 19:00:00 Transfer from Chime Savings Account 4.200 NaN
64001 788 1 1 122 2023-07-20 19:00:00 Cash app*cash out visa direct caus 2.150 NaN
64002 788 1 1 82 2023-08-15 19:00:00 Transfer from Chime Savings Account 2.000 NaN
64006 788 1 1 9 2023-08-10 19:00:00 Transfer from CB 165.178 NaN
64009 788 1 1 37 2023-07-24 19:00:00 Cash app*cash out visa direct caus 3.930 NaN
64010 788 1 1 36 2023-07-20 19:00:00 Transfer from Chime Savings Account 1.102 NaN
64012 788 1 1 34 2023-08-15 19:00:00 Transfer from Chime Savings Account 3.456 NaN
64014 788 1 1 125 2023-08-12 19:00:00 Transfer from Chime Savings Account 1.906 NaN
In [8]:
# Calculate the percentage the number of null rows in the 'category' column and total number of rows
null_percentage = (df_transaction['category'].isnull().sum() / len(df_transaction)) * 100

# Print the result
print(f"Percentage of null values in 'category': {null_percentage:.2f}%")
Percentage of null values in 'category': 0.10%
In [9]:
df_transaction.loc[df_transaction['category'].isnull()]['client_id'].value_counts()
Out[9]:
client_id
880    186
788     71
Name: count, dtype: int64
In [10]:
df_user.loc[df_user['CLIENT_ID'].isin([880, 788])]
Out[10]:
CLIENT_ID IS_INTERESTED_INVESTMENT IS_INTERESTED_BUILD_CREDIT IS_INTERESTED_INCREASE_INCOME IS_INTERESTED_PAY_OFF_DEBT IS_INTERESTED_MANAGE_SPENDING IS_INTERESTED_GROW_SAVINGS
787 788 False False False False False False
879 880 False False False False False False
In [7]:
df_transaction.loc[(df_transaction['client_id'] == 880) & (~df_transaction['category'].isnull())].head(10)
Out[7]:
client_id bank_id account_id txn_id txn_date description amount category
68274 880 828 953 169008 2023-07-03 Point of Sale Debit L340 DATE 06-30 ERENTERPLAN COM PS://WWWER -6.634 Insurance
68275 880 829 954 74551 2023-09-18 CHECK111 -19.674 Uncategorized
68276 880 828 953 64342 2023-06-19 Point of Sale Debit L340 DATE 06-17 CASH APP*ANNIE TOT877-417-4551 -7.000 Third Party
68277 880 828 953 26519 2023-08-18 Point of Sale Debit L343 TIME 11:31 AM DATE 08-17 7-ELEVEN RALEIGH -8.752 Gas Stations
68278 880 828 953 4411 2023-06-16 Point of Sale Debit L340 DATE 06-15 STORE*CHICK-FIL-A WWW DOORDASH -8.378 Restaurants
68279 880 828 953 117747 2023-07-03 Point of Sale Debit L340 DATE 06-30 SPI*DUKE-ENERGY 1036 -37.922 Utilities
68280 880 828 953 132382 2023-06-13 Point of Sale Debit L340 DATE 06-12 CASH APP*ANNIE TOT877-417-4551 -5.000 Third Party
68281 880 828 953 143329 2023-08-11 Point of Sale Debit L340 DATE 08-10 CASH APP*AMY FULLE8774174551 -9.000 Third Party
68282 880 828 953 92683 2023-06-19 Point of Sale Debit L340 DATE 06-17 SQ *JEWEL S CRYSTAJacksonville -2.568 Uncategorized
68283 880 828 953 94298 2023-08-28 Point of Sale Debit L340 DATE 08-27 CASH APP*AMY FULLE8774174551 -6.600 Third Party
In [8]:
df_transaction.loc[(df_transaction['client_id'] == 788) & (~df_transaction['category'].isnull())].head(10)
Out[8]:
client_id bank_id account_id txn_id txn_date description amount category
63985 788 1 1 22 2023-06-25 19:00:00 Transfer from Chime Savings Account 16.000 Transfer Credit
63986 788 1 1 21 2023-06-10 19:00:00 Transfer from Chime Savings Account 90.000 Transfer Credit
63987 788 1 1 30 2023-06-11 19:00:00 Transfer from Chime Savings Account 3.306 Transfer Credit
63988 788 1 1 123 2023-09-26 19:00:00 Transfer from Chime Savings Account 2.198 Transfer Credit
63989 788 1 1 110 2023-06-10 19:00:00 Transfer from Chime Savings Account 4.782 Transfer Credit
63990 788 1 1 111 2023-09-21 19:00:00 Transfer from Chime Savings Account 1.188 Transfer Credit
63991 788 1 1 112 2023-06-10 19:00:00 Transfer from Chime Savings Account 1.998 Transfer Credit
63992 788 1 1 113 2023-07-10 19:00:00 Transfer from Chime Savings Account 8.000 Transfer Credit
63994 788 1 1 23 2023-06-04 19:00:00 Transfer from Chime Savings Account 2.000 Transfer Credit
63995 788 1 1 126 2023-07-14 19:00:00 Transfer from Chime Savings Account 1.100 Transfer Credit
In [12]:
df_transaction.loc[df_transaction['category'].isnull()]['bank_id'].value_counts()
Out[12]:
bank_id
862    180
1       71
30       6
Name: count, dtype: int64

The descriptions don't point to anything significant either for why the category column is null.

In [13]:
df_transaction.loc[df_transaction['category'].isnull()]['description'].value_counts()
Out[13]:
description
Transfer from Chime Savings Account                                                                    116
Dandy Mini Mart                                                                                         28
Dunkin Donuts                                                                                           14
Transfer to Chime Savings Account                                                                       13
McDonald's                                                                                              13
Cash App*Maryse Hemant                                                                                  11
ATM Withdrawal                                                                                          10
Cash app*cash out      visa direct  caus                                                                 9
Wendy's                                                                                                  6
Transfer from CB                                                                                         6
Transfer to CB                                                                                           6
Dollar General                                                                                           5
Walmart                                                                                                  4
Subway                                                                                                   2
Sunoco                                                                                                   2
MERCHANT PAYMENT - 510001 UNITED DAIRY FA Maryse Hemant KY ON 062623 FROM CARD#: 3168                    1
5/3 JEANIE DEPOSIT 2306261 - 004463 5002 OLD TAYLR MILL Maryse Hemant KY ON 062623 FROM CARD#: 3168      1
ENOVA LOC NETCRDIS 3168 062623                                                                           1
PYMT TO RUMPKE WEB RECUR - 003 RUMPKE WEB RECURRING CINCINNATI OH                                        1
ELECTRONIC IMAGE                                                                                         1
Cash Withdrawal Fee                                                                                      1
KFC                                                                                                      1
Store                                                                                                    1
INTERBANK TRANSFER TO 000:1036 - igit01 Maryse Hemant NY                                                 1
Progressive                                                                                              1
Arby's                                                                                                   1
Maryse Hemant                                                                                            1
Name: count, dtype: int64
In [14]:
# Plot distribution of the 'amount' column for Null Categories
plt.figure(figsize=(10, 6))
sns.histplot(df_transaction.loc[df_transaction['category'].isnull()]['amount'], bins=10, color='skyblue', edgecolor='black')
plt.title("Distribution of Transaction Amounts (Null Categories)", fontsize=16, weight='bold')
plt.xlabel("Amount", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
In [15]:
df_transaction.loc[~df_transaction['category'].isnull()]['amount'].describe()
Out[15]:
count    258522.000000
mean          2.549586
std          81.164803
min       -9162.460000
25%          -6.000000
50%          -1.880000
75%           2.000000
max        9397.830000
Name: amount, dtype: float64
In [16]:
# Plot distribution of the 'amount' column
plt.figure(figsize=(10, 6))
sns.histplot(df_transaction.loc[~df_transaction['category'].isnull()]['amount'], bins=20, color='red', edgecolor='black')
plt.title("Distribution of Transaction Amounts (Non-Null Categories)", fontsize=16, weight='bold')
plt.xlabel("Amount", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

With the analysis above, we decide to drop rows with null values in the category column.

In [15]:
df_transaction = df_transaction.loc[~df_transaction['category'].isnull()]
1.2 ID and Transaction date analysis

With only 880 clients (out of 1000) making transactions, there is an opportunity to reach out to these clients to get them to be more active.

In [19]:
print(f'No. of clients with transactions: {df_transaction.client_id.nunique()}')
No. of clients with transactions: 880
In [20]:
print(f'No. of banks: {df_transaction.bank_id.nunique()}')
No. of banks: 990
In [21]:
print(f'No. of accounts: {df_transaction.account_id.nunique()}')
No. of accounts: 1131
In [22]:
print(f'No. of transactions: {df_transaction.txn_id.nunique()}')
No. of transactions: 190319

From the word cloud below, we notice that there's a seasonal pattern for transaction volume in a particular week. For example, the number of transactions are highest on Mondays, followed by Fridays. They are at their lowest on the weekends.

When building our classifier, perhaps there's different type of transactions that are being done over weekdays and weekends which may be a very useful feature when developing our model. We'll explore this below when we compare the top n-grams that appear between the different days.

In [23]:
# Count the number of transactions per date
txn_count = df_transaction["txn_date"].dt.date.value_counts().sort_index()

# Plot the distribution
plt.figure(figsize=(30, 6))
txn_count.plot(kind="bar", color="skyblue")
plt.title("Distribution of Transaction Dates", fontsize=14)
plt.xlabel("Transaction Date", fontsize=12)
plt.ylabel("Number of Transactions", fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()
plt.show()
1.3 WordClouds & Pre-processing

For pre-processing, we aim to drop rows with no meaningful words. I define rows with no meaningful words as:

  • Rows comprised of only stopwords (noise)
  • Rows with only numbers/non-alphanumerics

To do this, I'll aim to create WordClouds to identify the top words among the 7 most sizeable categories (due to them accounting for 60% of trasactions) and look out for transactions that aren't indicative of intent in their description.

From our analysis, we find the following:

  • The word Maryse Hemant appears at the top 3 across several categories e.g., Uncategorized, Third Party, Restaurants, etc.
  • Additionally, the word CHECK111 also appears in several transactions as the only word in the description. We treat this as noise and will remove rows containing only this word in our pre-processing.
  • The word CASH APP seems to be strongly tied to the Third Party category, while PURCHASE AUTHORIZED is to Restaurants/SuperMarkets and Groceries/Convenience Stores, Savings Accounts and Chime Savings is to Transfer Credit, as well as MoneyLion and InstaCash is to Loans.

Once, we've identified our stopwords, we pre-process our datasets with the following steps to generate a description_stem column for n-gram analysis in a later section.

In [25]:
df_transaction.category.value_counts(normalize=True)
Out[25]:
category
Uncategorized                 0.113692
Third Party                   0.111070
Restaurants                   0.101991
Transfer Credit               0.083401
Loans                         0.075835
Convenience Stores            0.072063
Supermarkets and Groceries    0.064791
Transfer Debit                0.058463
Gas Stations                  0.049973
Internal Account Transfer     0.046352
Payroll                       0.031332
Shops                         0.028694
Bank Fees                     0.024880
Transfer                      0.024273
ATM                           0.021940
Transfer Deposit              0.019248
Digital Entertainment         0.017503
Utilities                     0.015929
Clothing and Accessories      0.012339
Department Stores             0.007744
Insurance                     0.006785
Service                       0.003520
Arts and Entertainment        0.001536
Travel                        0.001420
Food and Beverage Services    0.001327
Interest                      0.001083
Check Deposit                 0.000816
Healthcare                    0.000801
Telecommunication Services    0.000615
Gyms and Fitness Centers      0.000267
Payment                       0.000159
Bank Fee                      0.000139
Tax Refund                    0.000019
Name: proportion, dtype: float64
In [26]:
sizeable_categories = [
    'Uncategorized',
    'Third Party',
    'Restaurants',
    'Transfer Credit',
    'Loans',
    'Convenience Stores',
    'Supermarkets and Groceries',
]

for x in sizeable_categories:
    plot_wordcloud(
        category=x,
        title=f'Most Frequently Used Words among the {x} Category',
        df=df_transaction,
        text_col_in_df='description',
        max_words=500,
    )
In [27]:
df_transaction.loc[df_transaction['description'].str.contains('Maryse|Maryse Hemant|Hemant')]
Out[27]:
client_id bank_id account_id txn_id txn_date description amount category
995 11 1 1 104 2023-09-21 CASH APP*Maryse Hemant 09/21 #1872 PMNT RCVD CASH APP*Maryse Hemant SAN FRANCISCO CA 19.650 Third Party
996 11 1 2 88 2023-07-07 Zelle Transfer Conf# g0000YiSW; Maryse Hemant et al v Google LLC Class Action S 19.076 Digital Entertainment
997 11 1 2 87 2023-09-01 Zelle payment from Maryse Hemant Conf# 99a1jntmk 12.000 Third Party
1000 11 1 1 111 2023-06-28 STASH CAPITAL (S DES:ACH ID:1872 INDN:Maryse Hemant CO ID:1872 PPD 8.000 Uncategorized
1001 11 1 1 110 2023-08-04 EMPLOYCO II, INC DES:PAYROLL ID:032398 INDN:Maryse Hemant CO ID:Maryse Hemant 210.656 Payroll
... ... ... ... ... ... ... ... ...
258760 880 489 558 113949 2023-07-17 Maryse Hemant TO STEPHEN ON 1036 -10.000 Uncategorized
258761 880 489 558 40378 2023-07-12 APLFCU CK WEBXFR TRANSFER 230712 1036 Maryse Hemant -62.000 Transfer Debit
258768 880 488 557 95989 2023-09-25 SHOPPERS FOOD/ 09/25 #Maryse Hemant FOOD/PHA BALTIMORE MD -7.196 Supermarkets and Groceries
258769 880 488 557 147385 2023-07-17 WM SUPERCENTER 07/16 #3168 PURCHASE Wal-Mart Store Maryse Hemant -20.476 Supermarkets and Groceries
258770 880 488 557 118034 2023-08-08 BKOFAMMaryse Hemant ATM 08/08 #3168 WITHDRWL MIDDLESEX BALTIMORE MD -4.000 ATM

37969 rows × 8 columns

In [28]:
df_transaction.loc[df_transaction['description'].str.contains('Maryse|Maryse Hemant|Hemant')].category.value_counts(normalize=True)
Out[28]:
category
Uncategorized                 0.186758
Third Party                   0.185177
Transfer Debit                0.147989
Loans                         0.100213
Restaurants                   0.073349
Convenience Stores            0.062024
Gas Stations                  0.046933
Supermarkets and Groceries    0.044168
ATM                           0.026653
Transfer Credit               0.022992
Transfer                      0.018778
Clothing and Accessories      0.015039
Digital Entertainment         0.014486
Shops                         0.012036
Bank Fees                     0.010245
Insurance                     0.008876
Department Stores             0.006321
Utilities                     0.005136
Service                       0.004346
Transfer Deposit              0.002028
Payroll                       0.001607
Travel                        0.001317
Healthcare                    0.000895
Food and Beverage Services    0.000816
Internal Account Transfer     0.000737
Telecommunication Services    0.000658
Arts and Entertainment        0.000211
Gyms and Fitness Centers      0.000158
Payment                       0.000026
Bank Fee                      0.000026
Name: proportion, dtype: float64
In [29]:
df_transaction.loc[df_transaction['description'].str.contains('CHECK111')]
Out[29]:
client_id bank_id account_id txn_id txn_date description amount category
967 10 1 1 31 2023-08-14 CHECK111 18.784 Clothing and Accessories
1221 13 1 1 78 2023-09-20 CHECK111 6.672 Uncategorized
1609 18 1 1 5 2023-08-03 CHECK111 2.068 Clothing and Accessories
1710 18 1 1 107 2023-09-05 CHECK111 7.836 Clothing and Accessories
7881 79 1 1 126 2023-08-10 CHECK111 1.534 Supermarkets and Groceries
... ... ... ... ... ... ... ... ...
258767 880 488 557 17761 2023-09-26 CHECK111 -0.908 Restaurants
258771 880 488 557 152928 2023-09-07 CHECK111 -2.000 Gas Stations
258772 880 488 557 107883 2023-09-22 CHECK111 -1.542 Restaurants
258774 880 488 557 19800 2023-08-14 CHECK111 -2.664 Restaurants
258775 880 488 557 718 2023-07-03 CHECK111 -8.842 Supermarkets and Groceries

6697 rows × 8 columns

In [30]:
df_transaction.loc[df_transaction['description'].str.contains('CHECK111')].category.value_counts(normalize=True)
Out[30]:
category
Restaurants                   0.208153
Gas Stations                  0.162909
Supermarkets and Groceries    0.151411
Uncategorized                 0.147081
Convenience Stores            0.133194
Shops                         0.056593
Loans                         0.033746
Digital Entertainment         0.027027
Third Party                   0.018217
Insurance                     0.014185
Clothing and Accessories      0.013737
Utilities                     0.011348
Service                       0.009557
Department Stores             0.007167
Bank Fees                     0.003584
Healthcare                    0.001195
Food and Beverage Services    0.000597
Travel                        0.000149
Arts and Entertainment        0.000149
Name: proportion, dtype: float64
In [31]:
df_transaction.loc[df_transaction['description'].str.contains('MoneyLion')]
Out[31]:
client_id bank_id account_id txn_id txn_date description amount category
15 2 1 2 56 2023-09-16 MoneyLion Instacash 10.000 Loans
16 2 1 2 21 2023-08-16 MoneyLion Instacash 19.000 Loans
18 2 1 2 69 2023-08-14 MoneyLion Instacash 20.000 Loans
19 2 1 2 68 2023-09-26 MoneyLion Instacash 20.000 Loans
23 2 1 2 62 2023-09-16 MoneyLion Instacash 20.000 Loans
... ... ... ... ... ... ... ... ...
252340 880 543 622 56551 2023-09-18 MoneyLion Plus Loan -4.488 Loans
254853 880 296 332 168523 2023-07-21 MoneyLion Plus Loan -8.972 Loans
256005 880 254 286 177620 2023-06-16 MoneyLion PAYMENT PPD ID: 1036 -1.028 Loans
256020 880 254 286 73216 2023-06-27 MoneyLion PAYMENT PPD ID: 1036 -1.020 Loans
256337 880 282 318 120960 2023-07-17 MoneyLion Plus Membership -1.998 Uncategorized

3194 rows × 8 columns

In [32]:
df_transaction.loc[df_transaction['description'].str.contains('MoneyLion')].category.value_counts(normalize=True)
Out[32]:
category
Loans                        0.982154
Uncategorized                0.016594
Internal Account Transfer    0.000939
Payroll                      0.000313
Name: proportion, dtype: float64

We begin our pre-processing step below. We use a helper function stem_text() to do this, as well as remove transactions that are only comprised of 'CHECK111'. The stem_text() function performs the following:

  • Non-Letter Removal: Removes characters that are not letters (e.g., numbers, punctuation, special characters) using a regular expression.
  • Case Normalization: Converts all text to lowercase.
  • Tokenization: Splits the text into individual words.
  • Stopword Removal: Removes common stopwords (e.g., "and", "the") using the NLTK stopwords list, with an option to include additional custom stopwords from the stopwords_list parameter.
  • Stemming: Reduces words to their root form using the Porter Stemmer (e.g., "running" becomes "run").
In [9]:
df_transaction = df_transaction.loc[~df_transaction['description'] == 'CHECK111']
In [10]:
stopwords_list = ['maryse', 'maryse hemant', 'hemant']

# Pre-process the raw text for uni-gram/bi-gram analysis
df_transaction['description_stem'] = df_transaction['description'].map(lambda x: stem_text(raw_text=x, stopwords_list=stopwords_list))
/home/faiq0913/Personal-Projects/moneylion-financial-transaction-classification/notebooks/faiq_utils.py:31: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
  review_text = BeautifulSoup(raw_text, "html.parser").get_text()
In [18]:
df_transaction.head(3)
Out[18]:
client_id bank_id account_id txn_id txn_date description amount category description_stem
0 1 1 1 4 2023-09-29 Earnin PAYMENT Donatas Danyal 20.0 Loans earnin payment donata danyal
1 1 1 1 3 2023-08-14 ONLINE TRANSFER FROM NDonatas DanyalDA O CARSON BUSINESS CHECKING 1216 1216 25.0 Transfer Credit onlin transfer ndonata danyalda carson busi check
2 1 1 1 5 2023-09-25 MONEY TRANSFER AUTHORIZED ON 09/25 FROM Earnin CDAEJ_B CA S583269001208168 111 20.0 Loans money transfer author earnin cdaej b ca
In [11]:
# Reindex the dataframe
df_transaction.reset_index(drop=True, inplace=True)
In [12]:
df_transaction.loc[3]['description']
Out[12]:
'ONLINE TRANSFER FROM CARSON N EVERYDAY CHECKING 1216 1216'
In [13]:
df_transaction.loc[3]['description_stem']
Out[13]:
'onlin transfer carson n everyday check'
1.4 Remove reviews that do not have any meaningful words

As stated in the previous section, for pre-processing, we aim to drop rows with no meaningful words. These are:

  • Rows comprised of only stopwords (noise)
  • Rows with only numbers/non-alphanumerics
In [14]:
# Find the number of meaningful words in each transaction
df_transaction['description_clean_len'] = df_transaction['description_stem'].str.split().map(len)
In [20]:
# There are 8 transactions that do not have any meaningful words
df_transaction[df_transaction['description_clean_len']==0].shape
Out[20]:
(2749, 10)
In [21]:
df_transaction[df_transaction['description_clean_len']==0]
Out[21]:
client_id bank_id account_id txn_id txn_date description amount category description_stem description_clean_len
15476 156 1 1 139 2023-07-20 00:00:00 Maryse Hemant 258597 07/21 22.000 Loans 0
15536 156 1 1 135 2023-07-07 00:00:00 Maryse Hemant: 270 80.000 Loans 0
16528 165 1 1 15 2023-08-31 00:00:00 Maryse Hemant 41.638 Restaurants 0
16536 165 1 1 21 2023-06-09 00:00:00 Maryse Hemant 51.384 Payroll 0
16543 165 1 1 3 2023-07-06 00:00:00 Maryse Hemant 41.736 Payroll 0
... ... ... ... ... ... ... ... ... ... ...
257723 880 520 596 445 2023-06-27 00:00:00 Maryse Hemant -16.000 Uncategorized 0
257872 880 477 543 173174 2023-08-10 07:52:05 Maryse Hemant -0.656 Restaurants 0
258020 880 472 538 21491 2023-06-05 00:00:00 Maryse Hemant TO Maryse Hemant Maryse Hemant ON 1036 -143.000 Uncategorized 0
258077 880 313 349 17350 2023-08-11 00:00:00 Maryse Hemant -55.440 Restaurants 0
258468 880 497 568 166735 2023-06-24 19:00:00 Maryse Hemant -4.006 Department Stores 0

2749 rows × 10 columns

In [15]:
# Drop the transactions that do not have any meaningful words
df_transaction = df_transaction.drop(df_transaction[df_transaction['description_clean_len']==0].index)
In [16]:
# Reindex the dataframe
df_transaction.reset_index(drop=True, inplace=True)
1.5 N-grams for sizeable categories & Weekend vs Weekday transactions

We noticed eaerlier that there was difference in frequency of transactions across time e.g., the no. of transactions on weekends was significantly less incomparion to weekdays (especially Mondays and Fridays). Hence, it may be useful to compare n-grams of transactions that occur on the different timeframes. Unfortunately, due to compute constraints, we're unable to create n-grams for the entire dataframe. Hence, we'll sample these transactions for simplicity.

In [47]:
barplot_cvec(
    df=df_transaction[df_transaction["txn_date"].dt.day_name().isin(['Monday'])],
    titles=[f'Top 20 Uni-grams - Monday', f'Top 20 Bi-grams - Monday', f'Top 20 Tri-grams - Monday'],
    color='#CB4335',
    xlimit=(0,2500)
)
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
Cell In[47], line 1
----> 1 barplot_cvec(
      2     df=df_transaction[df_transaction["txn_date"].dt.day_name().isin(['Monday'])],
      3     titles=[f'Top 20 Uni-grams - Monday', f'Top 20 Bi-grams - Monday', f'Top 20 Tri-grams - Monday'],
      4     color='#CB4335',
      5     xlimit=(0,2500)
      6 )

File ~/Personal-Projects/moneylion-financial-transaction-classification/notebooks/faiq_utils.py:92, in barplot_cvec(df, titles, color, xlimit, category)
     89 cvec = CountVectorizer(stop_words='english', ngram_range=ngram[i])
     91 # Save the tokens in a dataframe
---> 92 cvec_df = pd.DataFrame(cvec.fit_transform(words_series).todense(), columns=cvec.get_feature_names_out())
     93 sum_words = cvec_df.sum(axis=0) # Sum up the no. of occurences for each word
     94 top_words = sum_words.sort_values(ascending=False).head(20)

File ~/Personal-Projects/moneylion-financial-transaction-classification/.venv/lib/python3.11/site-packages/scipy/sparse/_base.py:932, in _spbase.todense(self, order, out)
    902 def todense(self, order=None, out=None):
    903     """
    904     Return a dense representation of this sparse array/matrix.
    905 
   (...)
    930         `numpy.matrix` object that shares the same memory.
    931     """
--> 932     return self._ascontainer(self.toarray(order=order, out=out))

File ~/Personal-Projects/moneylion-financial-transaction-classification/.venv/lib/python3.11/site-packages/scipy/sparse/_compressed.py:1056, in _cs_matrix.toarray(self, order, out)
   1054 if out is None and order is None:
   1055     order = self._swap('cf')[0]
-> 1056 out = self._process_toarray_args(order, out)
   1057 if not (out.flags.c_contiguous or out.flags.f_contiguous):
   1058     raise ValueError('Output array must be C or F contiguous')

File ~/Personal-Projects/moneylion-financial-transaction-classification/.venv/lib/python3.11/site-packages/scipy/sparse/_base.py:1287, in _spbase._process_toarray_args(self, order, out)
   1285     return out
   1286 else:
-> 1287     return np.zeros(self.shape, dtype=self.dtype, order=order)

MemoryError: Unable to allocate 17.5 GiB for an array with shape (63061, 37341) and data type int64

We notice a lot more spending in purchases/purchase authorized (which are normally associated with Restaurants/SuperMarkets and Groceries/Convenience Stores transactions) being made on Mondays. This clues us in that maybe most customers prefer to do their groceries on Mondays. Debit Card Purchases being highest on Monday (in terms of 3-grams) also supports this. This in comparison to the weekend where a lot of cash transfers are being made (from the 3-grams).

Hence, for feature engineering, it may be wise to include a feature detailing what day of the week the transaction is being made on.

In [20]:
barplot_cvec(
    df=df_transaction[df_transaction["txn_date"].dt.day_name().isin(['Monday'])].sample(frac=0.2, random_state=42),
    titles=[f'Top 20 Uni-grams - Monday', f'Top 20 Bi-grams - Monday', f'Top 20 Tri-grams - Monday'],
    color='#CB4335',
    xlimit=(0,2500)
)
In [45]:
weekends = ["Sunday", "Saturday"]

barplot_cvec(
    df=df_transaction[df_transaction["txn_date"].dt.day_name().isin(weekends)],
    titles=[f'Top 20 Uni-grams - Weekends', f'Top 20 Bi-grams - Weekends', f'Top 20 Tri-grams - Weekends'],
    color='#CB4335',
    xlimit=(0,2500)
)
In [44]:
sizeable_categories = [
    'Uncategorized',
    'Third Party',
    'Restaurants',
    'Transfer Credit',
    'Loans',
    'Convenience Stores',
    'Supermarkets and Groceries',
]

# Plot the uni-grams, bi-grams and tri-grams for sizeable categories
for i in sizeable_categories:
    barplot_cvec(
        df=df_transaction,
        category=i,
        titles=[f'Top 20 Uni-grams - {i}', f'Top 20 Bi-grams - {i}', f'Top 20 Tri-grams - {i}'],
        color='#CB4335',
        xlimit=(0,2500)
    )

2.0 Analyze user_profile.csv

2.1 Descriptive Statistics
In [31]:
for x in df_user.columns:
    if x == 'CLIENT_ID':
        continue
    else:
        plt.figure(figsize=(8, 5))
        df_user.groupby(x)["CLIENT_ID"].count().plot(kind="bar", color=["skyblue", "orange"], edgecolor="black")
        plt.xlabel(f'Is Interested in {x.replace("IS_INTERESTED", "").replace("_", " ").title()}')
        plt.ylabel("Number of Clients")
        plt.title("Distribution of Clients Interested in Investment")
        plt.xticks([0, 1], ["False", "True"], rotation=0)
        plt.show()
In [46]:
cat_cols = [x for x in df_user.columns if x != 'CLIENT_ID']

for i in range(7):
    if i == 0:
        continue
    else:
        total_users = df_user[cat_cols].sum(axis=1).ge(i).sum()
        print(f'No. of users with {i} or more interests: {total_users} ({100 * total_users/1000:.2f}%)')
No. of users with 1 or more interests: 183 (18.30%)
No. of users with 2 or more interests: 87 (8.70%)
No. of users with 3 or more interests: 28 (2.80%)
No. of users with 4 or more interests: 11 (1.10%)
No. of users with 5 or more interests: 4 (0.40%)
No. of users with 6 or more interests: 1 (0.10%)
2.2 User Transaction Level
In [60]:
user_transaction_count = df_transaction.groupby('client_id').agg(
    # we divide by 3 since our data is from June 1st to 30th September
    monthly_average_transaction_count=('client_id', lambda x: round(x.count() / 3))
).reset_index()

user_transaction_count
Out[60]:
client_id monthly_average_transaction_count
0 1 2
1 2 26
2 3 51
3 4 59
4 5 9
... ... ...
875 876 21
876 877 58
877 878 19
878 879 3
879 880 60311

880 rows × 2 columns

In [63]:
# Plotting a histogram for monthly_average_transaction_count
plt.figure(figsize=(8, 5))
plt.hist(user_transaction_count["monthly_average_transaction_count"], bins=5, color='skyblue', edgecolor='black', alpha=0.7)
plt.xlabel("Monthly Average Transaction Count")
plt.ylabel("Frequency")
plt.title("Histogram of Monthly Average Transaction Counts")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

3.0 Data Cleaning, Feature Engineering, Train-test Split & Save Datasets

We consolidate all our feature engineering and data cleaning steps in the below function, clean_split_dataset(). We break them down step-by-step:

  1. We first remove rows with null values in the category column from the bank_transaction.csv
  2. We then pre-process our dataset using the stem_text() function which performs: Non-Letter Removal:, Case Normalization:,Tokenization:, Stopword Removal:, Stemming:. We do this to identify transactions comprised of only stop words and non-alphanumerics (which we then remove as they are not meaningful).
Transaction-level features
  1. We create day related features to identify if the transaction happened on Friday, Monday or the weekend based on our analysis above.
  2. We also choose to keep the amount spent on the transaction. Sizeable amounts may be correlated to specific transaction categories.
User-level features
  1. From bank_transaction.csv, we create aggregate features to determine, on a monthly basis, how likely is a consumer make a transaciton of a particular category (and overall to determine activeness) based on their count (in the future, perhaps amount maybe better). This could potentially be an important feature if to determine if, for a particular user, a transacition category is more likely (in addition to the description column). We acknowledge that this may result in a sparse dataset, but it's a good first step for experimentation.
  2. From user_profile.csv, we adopt the intent columns in spite of their sparsity for experimentation.
Train-test Split

20% of the original dataset will be set aside and used as a test set. This will be useful in evaluating our model's performance on unseen data. We will also use stratification to preserve the class representation in our train and test set.

In [93]:
def clean_split_dataset():
    """Pre-processing and Feature Engineering function."""
    df_transaction = pd.read_csv('../data/bank_transaction.csv', parse_dates=['txn_date'])
    df_user = pd.read_csv('../data/user_profile.csv')

    ##### Transaction-level features #####
    df_transaction = df_transaction.loc[~df_transaction['category'].isnull()]
    df_transaction = df_transaction.loc[~df_transaction['description'].str.contains('CHECK111')]

    # Pre-process the raw text to remove stopwords
    stopwords_list = ['maryse', 'maryse hemant', 'hemant']
    df_transaction['description_stem'] = df_transaction['description'].map(lambda x: stem_text(raw_text=x, stopwords_list=stopwords_list))

    # Find the number of meaningful words in each transaction
    df_transaction['description_clean_len'] = df_transaction['description_stem'].str.split().map(len)

    # Drop the transactions that do not have any meaningful words  e.g., comprised of stopwords and numbers only
    df_transaction = df_transaction.drop(df_transaction[df_transaction['description_clean_len']==0].index)

    # Since we identified that the number of transactions differ between days of the week, we'll create features corresponding
    # to the day of the transaction
    df_transaction['day_monday'] = np.where(df_transaction["txn_date"].dt.day_name() == 'Monday', 1, 0)
    df_transaction['day_friday'] = np.where(df_transaction["txn_date"].dt.day_name() == 'Friday', 1, 0)
    df_transaction['day_weekend'] = np.where(df_transaction["txn_date"].dt.day_name().isin(['Saturday', 'Sunday']), 1, 0)

    ##### User-level features #####
    categories = list(df_transaction.category.unique())

    user_transaction_count = df_transaction.groupby('client_id').agg(
        monthly_transaction_count=('client_id', lambda x: round(x.count() / 3)),
        **{
            f"monthly_transaction_count_{str(category).lower().replace(' ', '_')}": (
                'category', 
                lambda x, cat=category: round((x == cat).sum() / 3)
            )
            for category in categories
        }
    ).reset_index()

    user_features = user_transaction_count.merge(df_user.astype(int), how='left', left_on='client_id', right_on='CLIENT_ID')
    user_features = user_features.drop(columns=['CLIENT_ID'])

    ##### Join User & Transaction-level features together #####
    df_final = df_transaction.merge(user_features, how='left', on='client_id')

    ##### Train-test Split
    # As we would like to stratify our target variable, we will need to first assign X and y
    X = df_final[[cols for cols in df_final.columns if cols != 'category']]
    y = df_final['category']

    # Perform a train_test_split to create a train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Merge X_train and y_train back together using index
    train = pd.merge(X_train, y_train, left_index=True, right_index=True)

    # Merge X_test and y_test back together using index
    test = pd.merge(X_test, y_test, left_index=True, right_index=True)

    # Reindex the train and test set
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)

    print(f'No. of unique categories in Train: {train.category.nunique()}')
    print(f'No. of unique categories in Test: {test.category.nunique()}')
    print("")
    print(f'Train size: {train.shape}')
    print(f'Test size: {test.shape}')

    # Save clean training set
    train.to_csv('../data/train.csv', index=False)
    test.to_csv('../data/test.csv', index=False)

    print("")
    print('Finished preparing datasets!')

    return (train, test)
In [94]:
train, test = clean_split_dataset()
/home/faiq0913/Personal-Projects/moneylion-financial-transaction-classification/notebooks/faiq_utils.py:31: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
  review_text = BeautifulSoup(raw_text, "html.parser").get_text()
No. of unique categories in Train: 33
No. of unique categories in Test: 33

Train size: (199260, 53)
Test size: (49816, 53)

Finished preparing datasets!
In [95]:
train.head(3)
Out[95]:
client_id bank_id account_id txn_id txn_date description amount description_stem description_clean_len day_monday day_friday day_weekend monthly_transaction_count monthly_transaction_count_loans monthly_transaction_count_transfer_credit monthly_transaction_count_transfer_deposit monthly_transaction_count_payroll monthly_transaction_count_uncategorized monthly_transaction_count_restaurants monthly_transaction_count_check_deposit monthly_transaction_count_third_party monthly_transaction_count_food_and_beverage_services monthly_transaction_count_internal_account_transfer monthly_transaction_count_shops monthly_transaction_count_supermarkets_and_groceries monthly_transaction_count_telecommunication_services monthly_transaction_count_bank_fees monthly_transaction_count_utilities monthly_transaction_count_insurance monthly_transaction_count_digital_entertainment monthly_transaction_count_transfer monthly_transaction_count_gyms_and_fitness_centers monthly_transaction_count_clothing_and_accessories monthly_transaction_count_department_stores monthly_transaction_count_healthcare monthly_transaction_count_gas_stations monthly_transaction_count_service monthly_transaction_count_travel monthly_transaction_count_arts_and_entertainment monthly_transaction_count_convenience_stores monthly_transaction_count_interest monthly_transaction_count_tax_refund monthly_transaction_count_transfer_debit monthly_transaction_count_atm monthly_transaction_count_bank_fee monthly_transaction_count_payment IS_INTERESTED_INVESTMENT IS_INTERESTED_BUILD_CREDIT IS_INTERESTED_INCREASE_INCOME IS_INTERESTED_PAY_OFF_DEBT IS_INTERESTED_MANAGE_SPENDING IS_INTERESTED_GROW_SAVINGS category
0 577 1 2 90 2023-08-14 Online Banking transfer from CHK 6479 Confirmation# 1425 6.0 onlin bank transfer chk confirm 5 1 0 0 56 1 13 1 3 4 0 0 8 0 25 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 Internal Account Transfer
1 94 8 8 159 2023-09-18 DEPOSIT 5.0 deposit 1 1 0 0 136 23 4 15 17 15 0 0 0 0 56 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Payroll
2 880 770 887 99720 2023-06-16 Bulk 7202 Apple 1920 EAST WISCONSI APPLETON WIUS -10.9 bulk appl east wisconsi appleton wiu 6 0 1 0 60311 2789 806 118 0 7491 7586 1 6729 110 1801 2122 5170 23 2005 1314 511 1413 1878 19 797 616 50 3813 220 38 75 5773 93 0 5035 1891 12 14 0 0 0 0 0 0 Uncategorized

Let's do one final check on null values!

In [96]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199260 entries, 0 to 199259
Data columns (total 53 columns):
 #   Column                                                Non-Null Count   Dtype         
---  ------                                                --------------   -----         
 0   client_id                                             199260 non-null  int64         
 1   bank_id                                               199260 non-null  int64         
 2   account_id                                            199260 non-null  int64         
 3   txn_id                                                199260 non-null  int64         
 4   txn_date                                              199260 non-null  datetime64[ns]
 5   description                                           199260 non-null  object        
 6   amount                                                199260 non-null  float64       
 7   description_stem                                      199260 non-null  object        
 8   description_clean_len                                 199260 non-null  int64         
 9   day_monday                                            199260 non-null  int64         
 10  day_friday                                            199260 non-null  int64         
 11  day_weekend                                           199260 non-null  int64         
 12  monthly_transaction_count                             199260 non-null  int64         
 13  monthly_transaction_count_loans                       199260 non-null  int64         
 14  monthly_transaction_count_transfer_credit             199260 non-null  int64         
 15  monthly_transaction_count_transfer_deposit            199260 non-null  int64         
 16  monthly_transaction_count_payroll                     199260 non-null  int64         
 17  monthly_transaction_count_uncategorized               199260 non-null  int64         
 18  monthly_transaction_count_restaurants                 199260 non-null  int64         
 19  monthly_transaction_count_check_deposit               199260 non-null  int64         
 20  monthly_transaction_count_third_party                 199260 non-null  int64         
 21  monthly_transaction_count_food_and_beverage_services  199260 non-null  int64         
 22  monthly_transaction_count_internal_account_transfer   199260 non-null  int64         
 23  monthly_transaction_count_shops                       199260 non-null  int64         
 24  monthly_transaction_count_supermarkets_and_groceries  199260 non-null  int64         
 25  monthly_transaction_count_telecommunication_services  199260 non-null  int64         
 26  monthly_transaction_count_bank_fees                   199260 non-null  int64         
 27  monthly_transaction_count_utilities                   199260 non-null  int64         
 28  monthly_transaction_count_insurance                   199260 non-null  int64         
 29  monthly_transaction_count_digital_entertainment       199260 non-null  int64         
 30  monthly_transaction_count_transfer                    199260 non-null  int64         
 31  monthly_transaction_count_gyms_and_fitness_centers    199260 non-null  int64         
 32  monthly_transaction_count_clothing_and_accessories    199260 non-null  int64         
 33  monthly_transaction_count_department_stores           199260 non-null  int64         
 34  monthly_transaction_count_healthcare                  199260 non-null  int64         
 35  monthly_transaction_count_gas_stations                199260 non-null  int64         
 36  monthly_transaction_count_service                     199260 non-null  int64         
 37  monthly_transaction_count_travel                      199260 non-null  int64         
 38  monthly_transaction_count_arts_and_entertainment      199260 non-null  int64         
 39  monthly_transaction_count_convenience_stores          199260 non-null  int64         
 40  monthly_transaction_count_interest                    199260 non-null  int64         
 41  monthly_transaction_count_tax_refund                  199260 non-null  int64         
 42  monthly_transaction_count_transfer_debit              199260 non-null  int64         
 43  monthly_transaction_count_atm                         199260 non-null  int64         
 44  monthly_transaction_count_bank_fee                    199260 non-null  int64         
 45  monthly_transaction_count_payment                     199260 non-null  int64         
 46  IS_INTERESTED_INVESTMENT                              199260 non-null  int64         
 47  IS_INTERESTED_BUILD_CREDIT                            199260 non-null  int64         
 48  IS_INTERESTED_INCREASE_INCOME                         199260 non-null  int64         
 49  IS_INTERESTED_PAY_OFF_DEBT                            199260 non-null  int64         
 50  IS_INTERESTED_MANAGE_SPENDING                         199260 non-null  int64         
 51  IS_INTERESTED_GROW_SAVINGS                            199260 non-null  int64         
 52  category                                              199260 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(48), object(3)
memory usage: 80.6+ MB
In [97]:
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49816 entries, 0 to 49815
Data columns (total 53 columns):
 #   Column                                                Non-Null Count  Dtype         
---  ------                                                --------------  -----         
 0   client_id                                             49816 non-null  int64         
 1   bank_id                                               49816 non-null  int64         
 2   account_id                                            49816 non-null  int64         
 3   txn_id                                                49816 non-null  int64         
 4   txn_date                                              49816 non-null  datetime64[ns]
 5   description                                           49816 non-null  object        
 6   amount                                                49816 non-null  float64       
 7   description_stem                                      49816 non-null  object        
 8   description_clean_len                                 49816 non-null  int64         
 9   day_monday                                            49816 non-null  int64         
 10  day_friday                                            49816 non-null  int64         
 11  day_weekend                                           49816 non-null  int64         
 12  monthly_transaction_count                             49816 non-null  int64         
 13  monthly_transaction_count_loans                       49816 non-null  int64         
 14  monthly_transaction_count_transfer_credit             49816 non-null  int64         
 15  monthly_transaction_count_transfer_deposit            49816 non-null  int64         
 16  monthly_transaction_count_payroll                     49816 non-null  int64         
 17  monthly_transaction_count_uncategorized               49816 non-null  int64         
 18  monthly_transaction_count_restaurants                 49816 non-null  int64         
 19  monthly_transaction_count_check_deposit               49816 non-null  int64         
 20  monthly_transaction_count_third_party                 49816 non-null  int64         
 21  monthly_transaction_count_food_and_beverage_services  49816 non-null  int64         
 22  monthly_transaction_count_internal_account_transfer   49816 non-null  int64         
 23  monthly_transaction_count_shops                       49816 non-null  int64         
 24  monthly_transaction_count_supermarkets_and_groceries  49816 non-null  int64         
 25  monthly_transaction_count_telecommunication_services  49816 non-null  int64         
 26  monthly_transaction_count_bank_fees                   49816 non-null  int64         
 27  monthly_transaction_count_utilities                   49816 non-null  int64         
 28  monthly_transaction_count_insurance                   49816 non-null  int64         
 29  monthly_transaction_count_digital_entertainment       49816 non-null  int64         
 30  monthly_transaction_count_transfer                    49816 non-null  int64         
 31  monthly_transaction_count_gyms_and_fitness_centers    49816 non-null  int64         
 32  monthly_transaction_count_clothing_and_accessories    49816 non-null  int64         
 33  monthly_transaction_count_department_stores           49816 non-null  int64         
 34  monthly_transaction_count_healthcare                  49816 non-null  int64         
 35  monthly_transaction_count_gas_stations                49816 non-null  int64         
 36  monthly_transaction_count_service                     49816 non-null  int64         
 37  monthly_transaction_count_travel                      49816 non-null  int64         
 38  monthly_transaction_count_arts_and_entertainment      49816 non-null  int64         
 39  monthly_transaction_count_convenience_stores          49816 non-null  int64         
 40  monthly_transaction_count_interest                    49816 non-null  int64         
 41  monthly_transaction_count_tax_refund                  49816 non-null  int64         
 42  monthly_transaction_count_transfer_debit              49816 non-null  int64         
 43  monthly_transaction_count_atm                         49816 non-null  int64         
 44  monthly_transaction_count_bank_fee                    49816 non-null  int64         
 45  monthly_transaction_count_payment                     49816 non-null  int64         
 46  IS_INTERESTED_INVESTMENT                              49816 non-null  int64         
 47  IS_INTERESTED_BUILD_CREDIT                            49816 non-null  int64         
 48  IS_INTERESTED_INCREASE_INCOME                         49816 non-null  int64         
 49  IS_INTERESTED_PAY_OFF_DEBT                            49816 non-null  int64         
 50  IS_INTERESTED_MANAGE_SPENDING                         49816 non-null  int64         
 51  IS_INTERESTED_GROW_SAVINGS                            49816 non-null  int64         
 52  category                                              49816 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(48), object(3)
memory usage: 20.1+ MB